// UBloom.cp
// UBloom.h
// ----------------------------------------------------------------------------------
// Bloom Filter class for the Spell Checker project.
//
// Note: This file is proprietary and confidential to Art Pollard
//	and Lextek Internation.  
// Copyright 1994 Art Pollard / LexTek International
//
// A bloom filter is a probabilistic data structure that stores and 
// tests strings based upon a hashed comparison.  Each additional 
// probe through the bloom filter increases the probability that the 
// 'hit' information is correct.  A 'no hit' is always accurate; it is 
// a 'hit' from a single probe that may only be accurate to 80%.  We 
// go through enough times that we have a fairly good probability that 
// a find is correct (~99.95%).  
// See any book on data structures for more information.
//
// ----------------------------------------------------------------------------------
// History:
// 		Art Pollard			June 94
//			Original.  Simple bloom filter hid in a class acting 
//			as a module.
//		Clark Goble			08/09/94
//			Made the filter into an abstract C++ class.
// ----------------------------------------------------------------------------------

// ----------------------------------------------------------------------------------
// Includes


#ifndef _UBLOOM_
#include "UBloom.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "UError.h"

#ifdef _DEBUG_SPELL_
//#include "UDebugWindow.h"
#endif

//#define TEST

// ----------------------------------------------------------------------------------
// Macros & Stuff

// bitset(buf,bit): turns on bit number `bit` in the byte array `buf`
//		(byte index = bit / 8, bit within the byte = bit % 8, LSB first).
// bitget(buf,bit): yields 1 if that bit is on, 0 otherwise.
// Both arguments are parenthesized so that expressions such as `base + 2`
// can be passed safely.  `bit` is evaluated twice: pass no side effects.
#define	bitset(buf,bit) ((buf)[(bit) >> 3] |= (1 << ((bit) & 7)))
#define	bitget(buf,bit) ((((buf)[(bit) >> 3] >> ((bit) & 7 )) & 1 ))

/*
	unsigned int random_8[256]= {101,228,62,96,182,91,57,172,205,104,32,173,124,64,
			     188,77,163,34,127,217,251,49,206,138,69,250,33,111,
			     187,166,98,35,234,195,70,157,21,117,20,76,39,61,128,
			     197,201,244,2,243,123,31,74,196,116,241,92,51,236,
			     230,247,218,255,83,162,4,141,60,220,73,202,146,15,6,
			     145,130,23,14,118,53,155,119,208,143,170,58,112,46,
			     9,16,246,216,184,154,18,29,68,252,90,198,207,183,50,
			     82,66,109,176,25,136,235,126,113,160,132,178,72,186,100,
			     13,5,125,121,237,254,19,3,59,227,41,85,97,89,110,30,99,
			     171,174,81,147,242,102,27,80,106,8,215,165,164,148,7,122,
			     24,67,161,192,137,225,95,37,103,180,26,94,177,142,200,139,
			     240,78,11,12,120,169,153,134,84,185,151,135,54,253,56,191,
			     211,144,79,133,47,175,105,38,131,194,149,52,204,214,75,1,
			     140,249,159,44,219,87,152,129,114,65,203,222,86,17,221,
			     71,209,45,167,22,55,238,239,226,248,179,107,168,150,63,158,
			     0,223,156,212,233,224,210,40,115,190,213,108,10,43,231,48,42,
			     28,181,229,245,193,36,93,232,199,189,88};
*/



// ----------------------------------------------------------------------------------
//	UBloom		- Bloom Filter Constructor
// ----------------------------------------------------------------------------------
// This routine allocates the filter and initializes all the values.
// Size is an estimate of the number of keys that will be added to the filter or
// the number of bytes for the filter.  The flag "how" determines which method
// you wish to use.  By_Key means by the number of keys you wish to add.  By_Byte
// means to make it a specific size in bytes (for reading from disk or to fit it
// into a specific sized portion of memory).  If you size it By_Key the constructor 
// then calculates an optimal size for the filter.  TheNumBits is the number of bits
// turned on during insertions/tests.  It is basically an accuracy factor as a bloom 
// filter is a probabilistic data structure.  The hash is done this number of times.
//
// Size of filter is either the max number of keys or the size of the filter in
// BYTES!

UBloom::UBloom(unsigned long SizeOfFilter, long theNumBits, short how)
{
	unsigned long	SizeInBytes;	// Size of the bit array in bytes
	unsigned long	OptKeys;		// Optimal number of keys for that size

	// Put every member into a defined state first, so that the object is
	// still coherent (empty, zero-sized) if the allocation below fails.
	Filter = NULL;
	BloomSizeInBytes = 0;
	BloomSizeInBits = 0;
	NumBits = theNumBits;	// Number of bits per key
	NumKeys = 0;
	OptNumKeys = 0;

	if ( how == By_Key )
	{
		// Calculate the optimal size
		//   N = BK / (- ln(.5)) where N is number of bits
		//	See Dr. Dobb's Nov 1990, pg 22
		// The factor 1.422 is the one this code has always used (the exact
		// value of -1/ln(.5) is about 1.4427).  It is deliberately left
		// alone: changing it would change the filter size and therefore
		// every bit position of filters already written to disk.

		double IdealBits = ceil( ( (double) theNumBits * (double) SizeOfFilter) * 1.422 );

		// Whole bytes, rounded down exactly as before.  Anything that would
		// yield less than one byte (including zero or negative products)
		// is clamped to one byte so the hash modulus can never be zero.
		if ( IdealBits >= 8.0 )
			SizeInBytes = ( (unsigned long) IdealBits ) >> 3;
		else
			SizeInBytes = 1;

		OptKeys = SizeOfFilter;
	} else {
		// By_Byte: the caller dictates the size.  A zero-byte filter is
		// unusable (modulus of zero), so use one byte at minimum.
		SizeInBytes = ( SizeOfFilter > 0 ) ? SizeOfFilter : 1;

		// Calculate the optimal number of keys that can be added to the filter
		// After this efficiency decays.  See Dr. Dobb Nov 1990 pg 22
		// Skipped when theNumBits is not positive to avoid dividing by zero.
		OptKeys = 0;
		if ( theNumBits > 0 )
			OptKeys = (unsigned long) ( ceil ( (double) ( SizeInBytes * 8 ) * 0.693 /
										(double) theNumBits));
	} // if

	Filter = (UInt8 *) ::malloc(SizeInBytes);
	if (Filter) {	// non-null
		memset(Filter, 0, SizeInBytes);
		BloomSizeInBytes = SizeInBytes;
		BloomSizeInBits = SizeInBytes * 8;	// Number of bits
		OptNumKeys = OptKeys;
	} // if

	// An extremely goofy way around not having member constants in C++
	// 8-bit substitution table used to mix each key byte into the hash.
	static unsigned char Temp[256]=
				{101,228,62,96,182,91,57,172,205,104,32,173,124,64,
				  188,77,163,34,127,217,251,49,206,138,69,250,33,111,
				  187,166,98,35,234,195,70,157,21,117,20,76,39,61,128,
				  197,201,244,2,243,123,31,74,196,116,241,92,51,236,
				  230,247,218,255,83,162,4,141,60,220,73,202,146,15,6,
				  145,130,23,14,118,53,155,119,208,143,170,58,112,46,
				  9,16,246,216,184,154,18,29,68,252,90,198,207,183,50,
				  82,66,109,176,25,136,235,126,113,160,132,178,72,186,100,
				  13,5,125,121,237,254,19,3,59,227,41,85,97,89,110,30,99,
				  171,174,81,147,242,102,27,80,106,8,215,165,164,148,7,122,
				  24,67,161,192,137,225,95,37,103,180,26,94,177,142,200,139,
				  240,78,11,12,120,169,153,134,84,185,151,135,54,253,56,191,
				  211,144,79,133,47,175,105,38,131,194,149,52,204,214,75,1,
				  140,249,159,44,219,87,152,129,114,65,203,222,86,17,221,
				  71,209,45,167,22,55,238,239,226,248,179,107,168,150,63,158,
				  0,223,156,212,233,224,210,40,115,190,213,108,10,43,231,48,42,
				  28,181,229,245,193,36,93,232,199,189,88};
	random_8 = (unsigned char *)Temp;

	// 32-bit table indexed by the running 8-bit hash; the entries are
	// XORed together to form the bit position.
	static unsigned long Temp32[256]=
			   {0x30c7b4dbL,0xc3c423aaL,0x84f9a85cL,0xe93c6ac4L,0xb2d7c116L,0x55e4b13fL,
				0x649f5d2eL,0xbc754b12L,0x8042bb58L,0xd21ae844L,0xdaa5bef1L,0x4cd9d785L,
				0x9eaeb674L,0x4ac3baeeL,0xa8667553L,0xb7c3bbbL,0x51587f68L,0xaaba5eb7L,
				0x22a8b575L,0xf3dbeL,0x15e1c563L,0x4e32478L,0xdbc0a395L,0xe61f5f6bL,
				0xd84fd5bfL,0xfc375221L,0x9be0ddebL,0xab5c27e9L,0x9652f2b6L,0x86121bfaL,
				0x8211a07bL,0xcfa96182L,0xcd59e88L,0x97f1cb79L,0x785b5049L,0x9025ab8bL,
				0x10181160L,0x31201277L,0xc1407393L,0x6f29052L,0xa2231cc5L,0xa627dee6L,
				0xf673e46cL,0x8b887ab5L,0x7c7b48f3L,0x891777f8L,0x837e43baL,0x2f54c454L,
				0x28313cd2L,0x3ab5eb6L,0x2e4e357dL,0x8729a2b8L,0xede3c237L,0xa34395cL,
				0x6a81dc3eL,0x6cf0c35fL,0x60c6ec80L,0x58c81e5bL,0x1433fdd7L,0x5ec1e6d5L,
				0x0b234a4L,0x623ab9c6L,0x20a7cf1fL,0xdf961f2dL,0x738cb3bcL,0x8e77bc8fL,
				0xb55f1747L,0x8c6f554cL,0x5ad518b2L,0x6f158370L,0xceca3e1aL,0x5daa6517L,
				0xfa8d0d1L,0x394524e2L,0xbbf4ca9dL,0xf4879c8cL,0xaf5983aL,0x94c22d8aL,
				0x5b2b87c0L,0xf5ddef1cL,0x7435f5afL,0x17a133acL,0xe06c9d71L,0x1a64c9e5L,
				0xe4562120L,0x79edae45L,0x6e22fc31L,0xcc194041L,0xea1bda76L,0xe1983defL,
				0xa5923f89L,0x9ae8532L,0x1bef284dL,0xb8afff7fL,0x9339565dL,0xc5dc8c24L,
				0x92e8444eL,0x8bdf81bL,0xbdcf686aL,0x779de2e0L,0x75fd2e29L,0x66ebb79fL,
				0xfb78855L,0x2a1c4642L,0x72f36a0L,0xdc747072L,0xacb7726eL,0xc4a4d6a5L,
				0x1e166d9L,0xf862a6ceL,0x42c9e92fL,0xd0a6ced9L,0xecb691eL,0xdeac9adcL,
				0x687afa13L,0xddb198eL,0x150736L,0x5fd2b67L,0x453f4d18L,0xfe4676c2L,
				0xb02ed22L,0xf75e9f48L,0xb6263728L,0x1d38549aL,0xb46edf7aL,0x54cd0c3L,
				0x53cdf311L,0xbaeaaadeL,0xd73e6ff9L,0xe5fba7a7L,0xff49809cL,0xeebf3962L,
				0xf1538bb3L,0x6b9ae164L,0x32a2230L,0x3e721af4L,0x3d705110L,0x7da0530L,
				0x468146L,0x33ce71edL,0x9fa2e5b0L,0x3419b6dL,0x85d34fa9L,0x24106790L,
				0xd92a94b9L,0x56bbf673L,0xe247dbL,0x5c4d1487L,0x2561967L,0xa184924aL,
				0xb13653cL,0x3b9bb86fL,0x4ee20a2L,0x99b2b19L,0x19e962aeL,0x5293c98L,
				0xe769ef7L,0x591ea569L,0xa9783a23L,0x8acc2f50L,0x4183c056L,0xb7994197L,
				0xd61d1d3L,0x577997e3L,0x955df714L,0x379cf9f6L,0xa7b9ccfeL,0x487f7b33L,
				0x12fe7883L,0x504beae8L,0xcae5a4b1L,0x214abd26L,0xe37d5acbL,0x2e62afbL,
				0xd530f096L,0x8f08e4L,0xe865bfdaL,0x7b8acdL,0x8895b0a6L,0x6dfa365L,
				0x23c591c9L,0xb96ada3L,0xf228892L,0x712f58f5L,0x3f7a1adL,0x7e867c1dL,
				0xf36713fdL,0xc759d37eL,0xdd63cd2bL,0x448b63caL,0x36b1fb59L,0xaf8066d3L,
				0xf0513c8L,0x40b0c8eaL,0xc23d38fcL,0x1394d8b4L,0x35d6edbdL,0x544442d6L,
				0x4321c7a,0x4b2d9df,0x9c344a51,0xef857984,0x4d5a8a2a,0xcc2c43,
				0x9fc841L,0xbebcfe66L,0x98e27ea1L,0x9d8a3099L,0xd4da252cL,0x7ad099ffL,
				0x388fd13dL,0xb3608decL,0xd328257L,0x76976eccL,0x7f6b3122L,0xada329a8L,
				0x1f91c6d0L,0x816d60fL,0xa0d8e740L,0x6347d99bL,0x1c897434L,0x2becf43bL,
				0x2d99378L,0x694832abL,0x65f6d461L,0x47b4644fL,0xc9d8fe1L,0xec5728dL,
				0xc62c1527L,0x913b4986L,0x3fb6a9f0L,0x4f6add8L,0x1176439L,0x2cbe5c81L,
				0xc8ade0e4L,0x7071afd4L,0x3c556ccfL,0x1613f15aL,0x4982107cL,0xf9fff91L,
				0xa49e5965L,0x27d15bc7L,0x7b4189c1L,0x26684ebeL,0xd1e74c94L,0xc0f869eL,
				0xbfd44538L,0xebdfee5eL,0x671416e7L,0x2990e3ddL,0x61abb215L,0x8d8e5725L,
				0xcb24814bL,0x18de2635L,0xaef86bf2L,0xfdb3acdL};
	random_32 = (unsigned long *)Temp32;

	// Error Checking
	if (Filter == NULL)
	{	// error state
		ErrorFunc(eNo_Mem, SET);
	}
} // UBloom 


// ----------------------------------------------------------------------------------
//	~UBloom		- Bloom Filter Destructor
// ----------------------------------------------------------------------------------
// This routine deallocates the memory that holds the filter's bits.

UBloom::~UBloom()
{
	// Release the bit array.  free() is defined to do nothing when handed a
	// null pointer, so a filter whose allocation failed needs no special case.
	::free(Filter);

} // ~UBloom

size_t
UBloom :: GetBloomSizeInBytes()
{
	return BloomSizeInBytes;
}


// ----------------------------------------------------------------------------------
//	Insert		- Inserts a word
// ----------------------------------------------------------------------------------
// This routine inserts a word into the bloom filter.  The length parameter is used
// to allow non-textual material to be inserted (ie. non-null terminated).  There
// is no return value as all it does is march through memory twiddling bits.
// Inserts the `length` bytes at `Word` into the filter by setting NumBits bit
// positions.  Probe number k (counting from 0) hashes the key with k added to
// its first byte (wrapping at 256), which is the same sequence of keys the
// original in-place "Word[0]++" produced.  The caller's buffer is only read,
// never written, so string literals and zero-length keys are safe.
// Does nothing if the filter has no storage (allocation failed).
void
UBloom::Insert(char *Word, short length)
{
	// Without a bit array, or with a zero modulus, there is nothing to set.
	if ( Filter == NULL || BloomSizeInBits == 0 )
		return;

	// Snapshot the first key byte; each probe works on a perturbed copy.
	const unsigned char FirstByte = ( length > 0 ) ? (unsigned char) Word[0] : 0;

	// One pass per bit to be set for this key
	for ( long Probe = 0; Probe < NumBits; Probe++ )
	{
		unsigned char CharHashValue = 0;
		unsigned long LongHashValue = 0;

		// Get a hash value
		for ( short Index = 0; Index < length; Index++ )
		{
			unsigned char KeyByte = (unsigned char) Word[Index];
			if ( Index == 0 )
				KeyByte = (unsigned char) ( FirstByte + Probe );

			// The cast covers the whole XOR so that bytes >= 0x80 (negative
			// where char is signed) cannot index outside the 256-entry table.
			CharHashValue = random_8[(unsigned char) ( CharHashValue ^ KeyByte )];
			LongHashValue = (LongHashValue ^ random_32[CharHashValue]);
		}

		// Set the bit based upon the hash value
		bitset(Filter,(LongHashValue % BloomSizeInBits));
	} // for

	// Increment the insertion counter
	NumKeys++;

} // Insert


// ----------------------------------------------------------------------------------
//	Test		- Tests for a word being in the filter
// ----------------------------------------------------------------------------------
// Tests to see if a word or token had been inserted into the bloom filter.
// As the filter is probabilistic in nature the return value of FOUND is not
// necessarily accurate.  The accuracy depends upon the number of keys that have
// been inserted into the filter and the number of 'probes' used to detect for
// an inserted word.  Those features are determined when an instance of the
// class is created.

// Tests whether the `length` bytes at `Word` may have been inserted.
// Returns NOTFOUND as soon as one probed bit is clear, FOUND if every probed
// bit is set.  Probes use the same key sequence as Insert: probe k (from 0)
// hashes the key with k added to its first byte (wrapping at 256).  At least
// one probe is always made, as in the original do/while, even when NumBits
// is not positive.  The caller's buffer is only read, never written.
// Returns NOTFOUND if the filter has no storage (allocation failed).
short
UBloom::Test(char *Word, short length)
{
	// Without a bit array, or with a zero modulus, nothing can be present.
	if ( Filter == NULL || BloomSizeInBits == 0 )
		return NOTFOUND;

	// Snapshot the first key byte; each probe works on a perturbed copy.
	const unsigned char FirstByte = ( length > 0 ) ? (unsigned char) Word[0] : 0;
	long Probe = 0;

	// Loop through the bits for this key until a bit is found that
	// isn't set.  (ie. don't bother going through all the bits if we
	// already found that it isn't there)
	do {
		unsigned char CharHashValue = 0;
		unsigned long LongHashValue = 0;

		// Calculate the hash
		for ( short Index = 0; Index < length; Index++ )
		{
			unsigned char KeyByte = (unsigned char) Word[Index];
			if ( Index == 0 )
				KeyByte = (unsigned char) ( FirstByte + Probe );

			// The cast covers the whole XOR so that bytes >= 0x80 (negative
			// where char is signed) cannot index outside the 256-entry table.
			CharHashValue = random_8[(unsigned char) ( CharHashValue ^ KeyByte )];
			LongHashValue = (LongHashValue ^ random_32[CharHashValue]);
		}

		if ( bitget(Filter,(LongHashValue % BloomSizeInBits)) != 1 )
			return NOTFOUND;

		Probe++;
	} while ( Probe < NumBits );

	return FOUND;

} // Test

// ----------------------------------------------------------------------------------
//	Reset		- Clears all the set bits in the filter
// ----------------------------------------------------------------------------------
// Clears the set bits within the bloom filter.  No values within the filter itself
// are changed.  Thus all previous insertions are deleted and the filter is readied
// for a new insertion session.

void
UBloom::Reset()
{
	memset(Filter, 0, BloomSizeInBytes);
} // Reset


// ----------------------------------------------------------------------------------
//	FRead		- Reads a filter from a file
// ----------------------------------------------------------------------------------
// The file pointer that is passed to FRead needs to be pointed to an already
// opened file.  In addition an offset can be sent.  By default it is 0.

// Positions InFile at Offset (from the start of the file) and reads
// BloomSizeInBytes bytes into the filter's bit array.
// Returns OK on success, otherwise the error value reported by ErrorFunc.
short
UBloom::FRead(Uio *InFile, long Offset)
{
	// No storage to read into (allocation failed in the constructor):
	// report the same out-of-memory error the constructor raises.
	if ( Filter == NULL )
	{
		ErrorFunc(eNo_Mem, SET);
		return ( ErrorFunc(0, GET) );
	}

	// Clear any stale error first, as FWrite and FWrite2 do, so the checks
	// below reflect this call only and not an unrelated earlier failure.
	ErrorFunc(0,SET);

	InFile->SetPos(Offset, SEEK_SET);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );

	InFile->ReadData(Filter, BloomSizeInBytes);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );

	return ( OK );

} // FRead



// ----------------------------------------------------------------------------------
//	FWrite		- Writes a filter to a file
// ----------------------------------------------------------------------------------
// The file pointer that is passed to FWrite needs to be pointed to an already
// opened file.  In addition an offset can be sent.  By default it is 0.


// Positions OutFile at Offset (from the start of the file) and writes the
// filter's BloomSizeInBytes bytes to it.
// Returns OK on success, otherwise the error value reported by ErrorFunc.
short
UBloom::FWrite(Uio *OutFile, long Offset)
{
	// Nothing to write if the constructor could not allocate the filter.
	// This must be checked before the error state is cleared below, or the
	// constructor's out-of-memory error would be wiped out and a null
	// buffer handed to WriteData.
	if ( Filter == NULL )
	{
		ErrorFunc(eNo_Mem, SET);
		return ( ErrorFunc(0, GET) );
	}

	// Start from a clean error state so the checks reflect this call only.
	ErrorFunc(0,SET);
	OutFile->SetPos(Offset, SEEK_SET);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );

	OutFile->WriteData(Filter, BloomSizeInBytes);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );

	return ( OK );

} // FWrite

int
UBloom::FWrite2(FILE *DataFile, long Offset) {
	ErrorFunc(0,SET);
	fseek(DataFile,Offset,SEEK_SET);
	//OutFile->SetPos(Offset, SEEK_SET);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );

	fwrite(Filter,1,BloomSizeInBytes,DataFile);
	//OutFile->WriteData(Filter, BloomSizeInBytes);
	if ( ErrorFunc(0, GET) < eNo_Err )
		return ( ErrorFunc(0, GET) );
	return ( OK );
}


// ----------------------------------------------------------------------------------
//	Get_Insertions		- Gets the number of insertions to the filter
// ----------------------------------------------------------------------------------
// Returns the number of keys added thus far to the filter.  If this number is
// greater than the optimal number of keys then efficiency and accuracy are lost.
// ie. it behooves you to check it if you are using this data structure.

unsigned long
UBloom::Get_Insertions()
{
	return NumKeys;
}

// ----------------------------------------------------------------------------------
//	Get_OptKeys		- Gets the optimal number of insertions for a filter
// ----------------------------------------------------------------------------------
// Returns the optimal number of keys that can be added to the filter.  If the
// number of keys added is greater than the optimal number of keys then efficiency
// and accuracy are lost.  ie. it behooves you to check it if you are using this data
// structure.

unsigned long
UBloom::Get_OptKeys()
{
	return OptNumKeys;
}

// ----------------------------------------------------------------------------------
//	Dump		- Dumps the filter to the screen
// ----------------------------------------------------------------------------------
// This is used for debugging only and can (ought?) to be removed.
// NOTE:  System dependencies here! (although they are preprocessed out when not
//		debugging)

#ifdef _DEBUG_SPELL_

void
UBloom::Dump()
{
	unsigned short counter;
	unsigned short BitsSet =0;

	// Loop through all the bits
	for( counter=0; counter <= BloomSizeInBits; counter++){
		if( bitget(Filter,counter) !=0) {
			//UDebugWindow::Debug(" %u", counter);
			BitsSet++;
		}
	//UDebugWindow::Debug("\nBits Set = %u",BitsSet);
	//UDebugWindow::Debug("\nBloomSize = %ld",BloomSizeInBits);
	//UDebugWindow::Debug("\n");
	}
} // Dump

#endif

// ----------------------------------------------------------------------------------
// TESTING EXAMPLES
// ----------------------------------------------------------------------------------

#ifdef TEST

// ----------------------------------------------------------------------------------
// Main test
// ----------------------------------------------------------------------------------

void main()
{
   char Choice[40];
   short  Result;
   UBloom *BloomFilter;
   long NumBloomKeys;
   short TestResult;
   
   
   for(;;) {
     printf("\n\n1) Create Bloom Filter");
     printf("\n2) Insert Word into Bloom Filter");
     printf("\n3) Test Word to see if it is in the Bloom Filter");
     printf("\n4) Dump the Bloom Filter");
     printf("\n5) Save Bloom Filter");
     printf("\n6) Load Bloom Filter");
     printf("\n7) Get Num Insertions");
     printf("\n8) Get Opt Num Keys");
     printf("\n9) Reset Bloom Filter");
     printf("\n0) Quit");
     gets(Choice);
     Result = atoi(Choice);
     switch (Result) {
       case 1:
              printf("\nNum Bloom Keys :");
              gets(Choice);
              NumBloomKeys = atoi(Choice);
              BloomFilter = new UBloom(NumBloomKeys,5,By_Key);
              if(BloomFilter == NULL) printf("\nCreation of the Bloom Filter was unsuccessful");
              else printf("\nCreation was successful");
              break;
       case 2:
              printf("\nWord to Insert Into Bloom Filter :");
              gets(Choice);
              BloomFilter->Insert(Choice,strlen(Choice));
              printf("\nThe Word %s Was Inserted Into the Bloom Filter",Choice);
              break;
       case 3:
              printf("\nWord to Test Against Bloom Filter :");
              gets(Choice);
              TestResult = BloomFilter->Test(Choice,strlen(Choice));
              if(TestResult == FOUND) printf("\nThe Word Was Found!!!! Yeah!");
              else printf("\nThe %s Word Was NOTfound.... Sorry....",Choice);
              break;
       case 4:
              BloomFilter->Dump();
              break;
       case 5: 
       
       case 0: exit(1);
     }
   }
		
} // main

#endif // #ifdef _TEST_

